{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "d:\\Miniconda\\envs\\geo\\lib\\site-packages\\numpy\\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:\n",
      "d:\\Miniconda\\envs\\geo\\lib\\site-packages\\numpy\\.libs\\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll\n",
      "d:\\Miniconda\\envs\\geo\\lib\\site-packages\\numpy\\.libs\\libopenblas64__v0.3.21-gcc_10_3_0.dll\n",
      "  warnings.warn(\"loaded more than 1 DLL from .libs:\"\n"
     ]
    }
   ],
   "source": [
    "from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING,CONFIG_TO_TYPE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'transformers.models.albert.configuration_albert.AlbertConfig'> (<class 'transformers.models.albert.tokenization_albert.AlbertTokenizer'>, <class 'transformers.models.albert.tokenization_albert_fast.AlbertTokenizerFast'>)\n",
      "<class 'transformers.models.align.configuration_align.AlignConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.bark.configuration_bark.BarkConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.bart.configuration_bart.BartConfig'> (<class 'transformers.models.bart.tokenization_bart.BartTokenizer'>, <class 'transformers.models.bart.tokenization_bart_fast.BartTokenizerFast'>)\n",
      "<class 'transformers.models.bert.configuration_bert.BertConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.bert_generation.configuration_bert_generation.BertGenerationConfig'> (<class 'transformers.models.bert_generation.tokenization_bert_generation.BertGenerationTokenizer'>, None)\n",
      "<class 'transformers.models.big_bird.configuration_big_bird.BigBirdConfig'> (<class 'transformers.models.big_bird.tokenization_big_bird.BigBirdTokenizer'>, <class 'transformers.models.big_bird.tokenization_big_bird_fast.BigBirdTokenizerFast'>)\n",
      "<class 'transformers.models.bigbird_pegasus.configuration_bigbird_pegasus.BigBirdPegasusConfig'> (<class 'transformers.models.pegasus.tokenization_pegasus.PegasusTokenizer'>, <class 'transformers.models.pegasus.tokenization_pegasus_fast.PegasusTokenizerFast'>)\n",
      "<class 'transformers.models.biogpt.configuration_biogpt.BioGptConfig'> (<class 'transformers.models.biogpt.tokenization_biogpt.BioGptTokenizer'>, None)\n",
      "<class 'transformers.models.blenderbot.configuration_blenderbot.BlenderbotConfig'> (<class 'transformers.models.blenderbot.tokenization_blenderbot.BlenderbotTokenizer'>, <class 'transformers.models.blenderbot.tokenization_blenderbot_fast.BlenderbotTokenizerFast'>)\n",
      "<class 'transformers.models.blenderbot_small.configuration_blenderbot_small.BlenderbotSmallConfig'> (<class 'transformers.models.blenderbot_small.tokenization_blenderbot_small.BlenderbotSmallTokenizer'>, None)\n",
      "<class 'transformers.models.blip.configuration_blip.BlipConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.blip_2.configuration_blip_2.Blip2Config'> (<class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>, <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>)\n",
      "<class 'transformers.models.bloom.configuration_bloom.BloomConfig'> (None, <class 'transformers.models.bloom.tokenization_bloom_fast.BloomTokenizerFast'>)\n",
      "<class 'transformers.models.bridgetower.configuration_bridgetower.BridgeTowerConfig'> (<class 'transformers.models.roberta.tokenization_roberta.RobertaTokenizer'>, <class 'transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast'>)\n",
      "<class 'transformers.models.bros.configuration_bros.BrosConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.camembert.configuration_camembert.CamembertConfig'> (<class 'transformers.models.camembert.tokenization_camembert.CamembertTokenizer'>, <class 'transformers.models.camembert.tokenization_camembert_fast.CamembertTokenizerFast'>)\n",
      "<class 'transformers.models.canine.configuration_canine.CanineConfig'> (<class 'transformers.models.canine.tokenization_canine.CanineTokenizer'>, None)\n",
      "<class 'transformers.models.chameleon.configuration_chameleon.ChameleonConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.chinese_clip.configuration_chinese_clip.ChineseCLIPConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.clap.configuration_clap.ClapConfig'> (<class 'transformers.models.roberta.tokenization_roberta.RobertaTokenizer'>, <class 'transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast'>)\n",
      "<class 'transformers.models.clip.configuration_clip.CLIPConfig'> (<class 'transformers.models.clip.tokenization_clip.CLIPTokenizer'>, <class 'transformers.models.clip.tokenization_clip_fast.CLIPTokenizerFast'>)\n",
      "<class 'transformers.models.clipseg.configuration_clipseg.CLIPSegConfig'> (<class 'transformers.models.clip.tokenization_clip.CLIPTokenizer'>, <class 'transformers.models.clip.tokenization_clip_fast.CLIPTokenizerFast'>)\n",
      "<class 'transformers.models.clvp.configuration_clvp.ClvpConfig'> (<class 'transformers.models.clvp.tokenization_clvp.ClvpTokenizer'>, None)\n",
      "<class 'transformers.models.llama.configuration_llama.LlamaConfig'> (<class 'transformers.models.code_llama.tokenization_code_llama.CodeLlamaTokenizer'>, <class 'transformers.models.code_llama.tokenization_code_llama_fast.CodeLlamaTokenizerFast'>)\n",
      "<class 'transformers.models.codegen.configuration_codegen.CodeGenConfig'> (<class 'transformers.models.codegen.tokenization_codegen.CodeGenTokenizer'>, <class 'transformers.models.codegen.tokenization_codegen_fast.CodeGenTokenizerFast'>)\n",
      "<class 'transformers.models.cohere.configuration_cohere.CohereConfig'> (None, <class 'transformers.models.cohere.tokenization_cohere_fast.CohereTokenizerFast'>)\n",
      "<class 'transformers.models.convbert.configuration_convbert.ConvBertConfig'> (<class 'transformers.models.convbert.tokenization_convbert.ConvBertTokenizer'>, <class 'transformers.models.convbert.tokenization_convbert_fast.ConvBertTokenizerFast'>)\n",
      "<class 'transformers.models.cpmant.configuration_cpmant.CpmAntConfig'> (<class 'transformers.models.cpmant.tokenization_cpmant.CpmAntTokenizer'>, None)\n",
      "<class 'transformers.models.ctrl.configuration_ctrl.CTRLConfig'> (<class 'transformers.models.ctrl.tokenization_ctrl.CTRLTokenizer'>, None)\n",
      "<class 'transformers.models.data2vec.configuration_data2vec_audio.Data2VecAudioConfig'> (<class 'transformers.models.wav2vec2.tokenization_wav2vec2.Wav2Vec2CTCTokenizer'>, None)\n",
      "<class 'transformers.models.data2vec.configuration_data2vec_text.Data2VecTextConfig'> (<class 'transformers.models.roberta.tokenization_roberta.RobertaTokenizer'>, <class 'transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast'>)\n",
      "<class 'transformers.models.dbrx.configuration_dbrx.DbrxConfig'> (<class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>, <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>)\n",
      "<class 'transformers.models.deberta.configuration_deberta.DebertaConfig'> (<class 'transformers.models.deberta.tokenization_deberta.DebertaTokenizer'>, <class 'transformers.models.deberta.tokenization_deberta_fast.DebertaTokenizerFast'>)\n",
      "<class 'transformers.models.deberta_v2.configuration_deberta_v2.DebertaV2Config'> (<class 'transformers.models.deberta_v2.tokenization_deberta_v2.DebertaV2Tokenizer'>, <class 'transformers.models.deberta_v2.tokenization_deberta_v2_fast.DebertaV2TokenizerFast'>)\n",
      "<class 'transformers.models.distilbert.configuration_distilbert.DistilBertConfig'> (<class 'transformers.models.distilbert.tokenization_distilbert.DistilBertTokenizer'>, <class 'transformers.models.distilbert.tokenization_distilbert_fast.DistilBertTokenizerFast'>)\n",
      "<class 'transformers.models.dpr.configuration_dpr.DPRConfig'> (<class 'transformers.models.dpr.tokenization_dpr.DPRQuestionEncoderTokenizer'>, <class 'transformers.models.dpr.tokenization_dpr_fast.DPRQuestionEncoderTokenizerFast'>)\n",
      "<class 'transformers.models.electra.configuration_electra.ElectraConfig'> (<class 'transformers.models.electra.tokenization_electra.ElectraTokenizer'>, <class 'transformers.models.electra.tokenization_electra_fast.ElectraTokenizerFast'>)\n",
      "<class 'transformers.models.ernie.configuration_ernie.ErnieConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.deprecated.ernie_m.configuration_ernie_m.ErnieMConfig'> (<class 'transformers.models.deprecated.ernie_m.tokenization_ernie_m.ErnieMTokenizer'>, None)\n",
      "<class 'transformers.models.esm.configuration_esm.EsmConfig'> (<class 'transformers.models.esm.tokenization_esm.EsmTokenizer'>, None)\n",
      "<class 'transformers.models.falcon.configuration_falcon.FalconConfig'> (None, <class 'transformers.tokenization_utils_fast.PreTrainedTokenizerFast'>)\n",
      "<class 'transformers.models.fastspeech2_conformer.configuration_fastspeech2_conformer.FastSpeech2ConformerConfig'> (None, None)\n",
      "<class 'transformers.models.flaubert.configuration_flaubert.FlaubertConfig'> (<class 'transformers.models.flaubert.tokenization_flaubert.FlaubertTokenizer'>, None)\n",
      "<class 'transformers.models.fnet.configuration_fnet.FNetConfig'> (<class 'transformers.models.fnet.tokenization_fnet.FNetTokenizer'>, <class 'transformers.models.fnet.tokenization_fnet_fast.FNetTokenizerFast'>)\n",
      "<class 'transformers.models.fsmt.configuration_fsmt.FSMTConfig'> (<class 'transformers.models.fsmt.tokenization_fsmt.FSMTTokenizer'>, None)\n",
      "<class 'transformers.models.funnel.configuration_funnel.FunnelConfig'> (<class 'transformers.models.funnel.tokenization_funnel.FunnelTokenizer'>, <class 'transformers.models.funnel.tokenization_funnel_fast.FunnelTokenizerFast'>)\n",
      "<class 'transformers.models.gemma.configuration_gemma.GemmaConfig'> (<class 'transformers.models.gemma.tokenization_gemma.GemmaTokenizer'>, <class 'transformers.models.gemma.tokenization_gemma_fast.GemmaTokenizerFast'>)\n",
      "<class 'transformers.models.gemma2.configuration_gemma2.Gemma2Config'> (<class 'transformers.models.gemma.tokenization_gemma.GemmaTokenizer'>, <class 'transformers.models.gemma.tokenization_gemma_fast.GemmaTokenizerFast'>)\n",
      "<class 'transformers.models.git.configuration_git.GitConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.gpt2.configuration_gpt2.GPT2Config'> (<class 'transformers.models.gpt_sw3.tokenization_gpt_sw3.GPTSw3Tokenizer'>, None)\n",
      "<class 'transformers.models.gpt2.configuration_gpt2.GPT2Config'> (<class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>, <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>)\n",
      "<class 'transformers.models.gpt_bigcode.configuration_gpt_bigcode.GPTBigCodeConfig'> (<class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>, <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>)\n",
      "<class 'transformers.models.gpt_neo.configuration_gpt_neo.GPTNeoConfig'> (<class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>, <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>)\n",
      "<class 'transformers.models.gpt_neox.configuration_gpt_neox.GPTNeoXConfig'> (None, <class 'transformers.models.gpt_neox.tokenization_gpt_neox_fast.GPTNeoXTokenizerFast'>)\n",
      "<class 'transformers.models.gpt_neox_japanese.configuration_gpt_neox_japanese.GPTNeoXJapaneseConfig'> (<class 'transformers.models.gpt_neox_japanese.tokenization_gpt_neox_japanese.GPTNeoXJapaneseTokenizer'>, None)\n",
      "<class 'transformers.models.gptj.configuration_gptj.GPTJConfig'> (<class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>, <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>)\n",
      "<class 'transformers.models.deprecated.gptsan_japanese.configuration_gptsan_japanese.GPTSanJapaneseConfig'> (<class 'transformers.models.deprecated.gptsan_japanese.tokenization_gptsan_japanese.GPTSanJapaneseTokenizer'>, None)\n",
      "<class 'transformers.models.grounding_dino.configuration_grounding_dino.GroundingDinoConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.groupvit.configuration_groupvit.GroupViTConfig'> (<class 'transformers.models.clip.tokenization_clip.CLIPTokenizer'>, <class 'transformers.models.clip.tokenization_clip_fast.CLIPTokenizerFast'>)\n",
      "<class 'transformers.models.hubert.configuration_hubert.HubertConfig'> (<class 'transformers.models.wav2vec2.tokenization_wav2vec2.Wav2Vec2CTCTokenizer'>, None)\n",
      "<class 'transformers.models.ibert.configuration_ibert.IBertConfig'> (<class 'transformers.models.roberta.tokenization_roberta.RobertaTokenizer'>, <class 'transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast'>)\n",
      "<class 'transformers.models.idefics.configuration_idefics.IdeficsConfig'> (None, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.idefics2.configuration_idefics2.Idefics2Config'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.instructblip.configuration_instructblip.InstructBlipConfig'> (<class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>, <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>)\n",
      "<class 'transformers.models.instructblipvideo.configuration_instructblipvideo.InstructBlipVideoConfig'> (<class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>, <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>)\n",
      "<class 'transformers.models.jamba.configuration_jamba.JambaConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.jetmoe.configuration_jetmoe.JetMoeConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.deprecated.jukebox.configuration_jukebox.JukeboxConfig'> (<class 'transformers.models.deprecated.jukebox.tokenization_jukebox.JukeboxTokenizer'>, None)\n",
      "<class 'transformers.models.kosmos2.configuration_kosmos2.Kosmos2Config'> (<class 'transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer'>, <class 'transformers.models.xlm_roberta.tokenization_xlm_roberta_fast.XLMRobertaTokenizerFast'>)\n",
      "<class 'transformers.models.layoutlm.configuration_layoutlm.LayoutLMConfig'> (<class 'transformers.models.layoutlm.tokenization_layoutlm.LayoutLMTokenizer'>, <class 'transformers.models.layoutlm.tokenization_layoutlm_fast.LayoutLMTokenizerFast'>)\n",
      "<class 'transformers.models.layoutlmv2.configuration_layoutlmv2.LayoutLMv2Config'> (<class 'transformers.models.layoutlmv2.tokenization_layoutlmv2.LayoutLMv2Tokenizer'>, <class 'transformers.models.layoutlmv2.tokenization_layoutlmv2_fast.LayoutLMv2TokenizerFast'>)\n",
      "<class 'transformers.models.layoutlmv3.configuration_layoutlmv3.LayoutLMv3Config'> (<class 'transformers.models.layoutlmv3.tokenization_layoutlmv3.LayoutLMv3Tokenizer'>, <class 'transformers.models.layoutlmv3.tokenization_layoutlmv3_fast.LayoutLMv3TokenizerFast'>)\n",
      "<class 'transformers.models.led.configuration_led.LEDConfig'> (<class 'transformers.models.led.tokenization_led.LEDTokenizer'>, <class 'transformers.models.led.tokenization_led_fast.LEDTokenizerFast'>)\n",
      "<class 'transformers.models.lilt.configuration_lilt.LiltConfig'> (<class 'transformers.models.layoutlmv3.tokenization_layoutlmv3.LayoutLMv3Tokenizer'>, <class 'transformers.models.layoutlmv3.tokenization_layoutlmv3_fast.LayoutLMv3TokenizerFast'>)\n",
      "<class 'transformers.models.llama.configuration_llama.LlamaConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.llava.configuration_llava.LlavaConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.llava_next_video.configuration_llava_next_video.LlavaNextVideoConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.llava_next.configuration_llava_next.LlavaNextConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.longformer.configuration_longformer.LongformerConfig'> (<class 'transformers.models.longformer.tokenization_longformer.LongformerTokenizer'>, <class 'transformers.models.longformer.tokenization_longformer_fast.LongformerTokenizerFast'>)\n",
      "<class 'transformers.models.longt5.configuration_longt5.LongT5Config'> (<class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>, <class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>)\n",
      "<class 'transformers.models.luke.configuration_luke.LukeConfig'> (<class 'transformers.models.luke.tokenization_luke.LukeTokenizer'>, None)\n",
      "<class 'transformers.models.lxmert.configuration_lxmert.LxmertConfig'> (<class 'transformers.models.lxmert.tokenization_lxmert.LxmertTokenizer'>, <class 'transformers.models.lxmert.tokenization_lxmert_fast.LxmertTokenizerFast'>)\n",
      "<class 'transformers.models.m2m_100.configuration_m2m_100.M2M100Config'> (<class 'transformers.models.m2m_100.tokenization_m2m_100.M2M100Tokenizer'>, None)\n",
      "<class 'transformers.models.mamba.configuration_mamba.MambaConfig'> (None, <class 'transformers.models.gpt_neox.tokenization_gpt_neox_fast.GPTNeoXTokenizerFast'>)\n",
      "<class 'transformers.models.marian.configuration_marian.MarianConfig'> (<class 'transformers.models.marian.tokenization_marian.MarianTokenizer'>, None)\n",
      "<class 'transformers.models.mbart.configuration_mbart.MBartConfig'> (<class 'transformers.models.mbart.tokenization_mbart.MBartTokenizer'>, <class 'transformers.models.mbart.tokenization_mbart_fast.MBartTokenizerFast'>)\n",
      "<class 'transformers.models.deprecated.mega.configuration_mega.MegaConfig'> (<class 'transformers.models.roberta.tokenization_roberta.RobertaTokenizer'>, <class 'transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast'>)\n",
      "<class 'transformers.models.megatron_bert.configuration_megatron_bert.MegatronBertConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.mgp_str.configuration_mgp_str.MgpstrConfig'> (<class 'transformers.models.mgp_str.tokenization_mgp_str.MgpstrTokenizer'>, None)\n",
      "<class 'transformers.models.mistral.configuration_mistral.MistralConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.mixtral.configuration_mixtral.MixtralConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.mobilebert.configuration_mobilebert.MobileBertConfig'> (<class 'transformers.models.mobilebert.tokenization_mobilebert.MobileBertTokenizer'>, <class 'transformers.models.mobilebert.tokenization_mobilebert_fast.MobileBertTokenizerFast'>)\n",
      "<class 'transformers.models.mpnet.configuration_mpnet.MPNetConfig'> (<class 'transformers.models.mpnet.tokenization_mpnet.MPNetTokenizer'>, <class 'transformers.models.mpnet.tokenization_mpnet_fast.MPNetTokenizerFast'>)\n",
      "<class 'transformers.models.mpt.configuration_mpt.MptConfig'> (None, <class 'transformers.models.gpt_neox.tokenization_gpt_neox_fast.GPTNeoXTokenizerFast'>)\n",
      "<class 'transformers.models.mra.configuration_mra.MraConfig'> (<class 'transformers.models.roberta.tokenization_roberta.RobertaTokenizer'>, <class 'transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast'>)\n",
      "<class 'transformers.models.mt5.configuration_mt5.MT5Config'> (<class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>, <class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>)\n",
      "<class 'transformers.models.musicgen.configuration_musicgen.MusicgenConfig'> (<class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>, <class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>)\n",
      "<class 'transformers.models.musicgen_melody.configuration_musicgen_melody.MusicgenMelodyConfig'> (<class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>, <class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>)\n",
      "<class 'transformers.models.mvp.configuration_mvp.MvpConfig'> (<class 'transformers.models.mvp.tokenization_mvp.MvpTokenizer'>, <class 'transformers.models.mvp.tokenization_mvp_fast.MvpTokenizerFast'>)\n",
      "<class 'transformers.models.deprecated.nezha.configuration_nezha.NezhaConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.nllb_moe.configuration_nllb_moe.NllbMoeConfig'> (<class 'transformers.models.nllb.tokenization_nllb.NllbTokenizer'>, <class 'transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast'>)\n",
      "<class 'transformers.models.nystromformer.configuration_nystromformer.NystromformerConfig'> (<class 'transformers.models.albert.tokenization_albert.AlbertTokenizer'>, <class 'transformers.models.albert.tokenization_albert_fast.AlbertTokenizerFast'>)\n",
      "<class 'transformers.models.olmo.configuration_olmo.OlmoConfig'> (None, <class 'transformers.models.gpt_neox.tokenization_gpt_neox_fast.GPTNeoXTokenizerFast'>)\n",
      "<class 'transformers.models.oneformer.configuration_oneformer.OneFormerConfig'> (<class 'transformers.models.clip.tokenization_clip.CLIPTokenizer'>, <class 'transformers.models.clip.tokenization_clip_fast.CLIPTokenizerFast'>)\n",
      "<class 'transformers.models.openai.configuration_openai.OpenAIGPTConfig'> (<class 'transformers.models.openai.tokenization_openai.OpenAIGPTTokenizer'>, <class 'transformers.models.openai.tokenization_openai_fast.OpenAIGPTTokenizerFast'>)\n",
      "<class 'transformers.models.opt.configuration_opt.OPTConfig'> (<class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>, <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>)\n",
      "<class 'transformers.models.owlv2.configuration_owlv2.Owlv2Config'> (<class 'transformers.models.clip.tokenization_clip.CLIPTokenizer'>, <class 'transformers.models.clip.tokenization_clip_fast.CLIPTokenizerFast'>)\n",
      "<class 'transformers.models.owlvit.configuration_owlvit.OwlViTConfig'> (<class 'transformers.models.clip.tokenization_clip.CLIPTokenizer'>, <class 'transformers.models.clip.tokenization_clip_fast.CLIPTokenizerFast'>)\n",
      "<class 'transformers.models.paligemma.configuration_paligemma.PaliGemmaConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.pegasus.configuration_pegasus.PegasusConfig'> (<class 'transformers.models.pegasus.tokenization_pegasus.PegasusTokenizer'>, <class 'transformers.models.pegasus.tokenization_pegasus_fast.PegasusTokenizerFast'>)\n",
      "<class 'transformers.models.pegasus_x.configuration_pegasus_x.PegasusXConfig'> (<class 'transformers.models.pegasus.tokenization_pegasus.PegasusTokenizer'>, <class 'transformers.models.pegasus.tokenization_pegasus_fast.PegasusTokenizerFast'>)\n",
      "<class 'transformers.models.perceiver.configuration_perceiver.PerceiverConfig'> (<class 'transformers.models.perceiver.tokenization_perceiver.PerceiverTokenizer'>, None)\n",
      "<class 'transformers.models.persimmon.configuration_persimmon.PersimmonConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.phi.configuration_phi.PhiConfig'> (<class 'transformers.models.codegen.tokenization_codegen.CodeGenTokenizer'>, <class 'transformers.models.codegen.tokenization_codegen_fast.CodeGenTokenizerFast'>)\n",
      "<class 'transformers.models.phi3.configuration_phi3.Phi3Config'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.pix2struct.configuration_pix2struct.Pix2StructConfig'> (<class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>, <class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>)\n",
      "<class 'transformers.models.plbart.configuration_plbart.PLBartConfig'> (<class 'transformers.models.plbart.tokenization_plbart.PLBartTokenizer'>, None)\n",
      "<class 'transformers.models.prophetnet.configuration_prophetnet.ProphetNetConfig'> (<class 'transformers.models.prophetnet.tokenization_prophetnet.ProphetNetTokenizer'>, None)\n",
      "<class 'transformers.models.deprecated.qdqbert.configuration_qdqbert.QDQBertConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.qwen2.configuration_qwen2.Qwen2Config'> (<class 'transformers.models.qwen2.tokenization_qwen2.Qwen2Tokenizer'>, <class 'transformers.models.qwen2.tokenization_qwen2_fast.Qwen2TokenizerFast'>)\n",
      "<class 'transformers.models.qwen2_moe.configuration_qwen2_moe.Qwen2MoeConfig'> (<class 'transformers.models.qwen2.tokenization_qwen2.Qwen2Tokenizer'>, <class 'transformers.models.qwen2.tokenization_qwen2_fast.Qwen2TokenizerFast'>)\n",
      "<class 'transformers.models.rag.configuration_rag.RagConfig'> (<class 'transformers.models.rag.tokenization_rag.RagTokenizer'>, None)\n",
      "<class 'transformers.models.deprecated.realm.configuration_realm.RealmConfig'> (<class 'transformers.models.deprecated.realm.tokenization_realm.RealmTokenizer'>, <class 'transformers.models.deprecated.realm.tokenization_realm_fast.RealmTokenizerFast'>)\n",
      "<class 'transformers.models.recurrent_gemma.configuration_recurrent_gemma.RecurrentGemmaConfig'> (<class 'transformers.models.gemma.tokenization_gemma.GemmaTokenizer'>, <class 'transformers.models.gemma.tokenization_gemma_fast.GemmaTokenizerFast'>)\n",
      "<class 'transformers.models.reformer.configuration_reformer.ReformerConfig'> (<class 'transformers.models.reformer.tokenization_reformer.ReformerTokenizer'>, <class 'transformers.models.reformer.tokenization_reformer_fast.ReformerTokenizerFast'>)\n",
      "<class 'transformers.models.rembert.configuration_rembert.RemBertConfig'> (<class 'transformers.models.rembert.tokenization_rembert.RemBertTokenizer'>, <class 'transformers.models.rembert.tokenization_rembert_fast.RemBertTokenizerFast'>)\n",
      "<class 'transformers.models.deprecated.retribert.configuration_retribert.RetriBertConfig'> (<class 'transformers.models.deprecated.retribert.tokenization_retribert.RetriBertTokenizer'>, <class 'transformers.models.deprecated.retribert.tokenization_retribert_fast.RetriBertTokenizerFast'>)\n",
      "<class 'transformers.models.roberta.configuration_roberta.RobertaConfig'> (<class 'transformers.models.roberta.tokenization_roberta.RobertaTokenizer'>, <class 'transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast'>)\n",
      "<class 'transformers.models.roberta_prelayernorm.configuration_roberta_prelayernorm.RobertaPreLayerNormConfig'> (<class 'transformers.models.roberta.tokenization_roberta.RobertaTokenizer'>, <class 'transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast'>)\n",
      "<class 'transformers.models.roc_bert.configuration_roc_bert.RoCBertConfig'> (<class 'transformers.models.roc_bert.tokenization_roc_bert.RoCBertTokenizer'>, None)\n",
      "<class 'transformers.models.roformer.configuration_roformer.RoFormerConfig'> (<class 'transformers.models.roformer.tokenization_roformer.RoFormerTokenizer'>, <class 'transformers.models.roformer.tokenization_roformer_fast.RoFormerTokenizerFast'>)\n",
      "<class 'transformers.models.rwkv.configuration_rwkv.RwkvConfig'> (None, <class 'transformers.models.gpt_neox.tokenization_gpt_neox_fast.GPTNeoXTokenizerFast'>)\n",
      "<class 'transformers.models.seamless_m4t.configuration_seamless_m4t.SeamlessM4TConfig'> (<class 'transformers.models.seamless_m4t.tokenization_seamless_m4t.SeamlessM4TTokenizer'>, <class 'transformers.models.seamless_m4t.tokenization_seamless_m4t_fast.SeamlessM4TTokenizerFast'>)\n",
      "<class 'transformers.models.seamless_m4t_v2.configuration_seamless_m4t_v2.SeamlessM4Tv2Config'> (<class 'transformers.models.seamless_m4t.tokenization_seamless_m4t.SeamlessM4TTokenizer'>, <class 'transformers.models.seamless_m4t.tokenization_seamless_m4t_fast.SeamlessM4TTokenizerFast'>)\n",
      "<class 'transformers.models.siglip.configuration_siglip.SiglipConfig'> (<class 'transformers.models.siglip.tokenization_siglip.SiglipTokenizer'>, None)\n",
      "<class 'transformers.models.speech_to_text.configuration_speech_to_text.Speech2TextConfig'> (<class 'transformers.models.speech_to_text.tokenization_speech_to_text.Speech2TextTokenizer'>, None)\n",
      "<class 'transformers.models.deprecated.speech_to_text_2.configuration_speech_to_text_2.Speech2Text2Config'> (<class 'transformers.models.deprecated.speech_to_text_2.tokenization_speech_to_text_2.Speech2Text2Tokenizer'>, None)\n",
      "<class 'transformers.models.speecht5.configuration_speecht5.SpeechT5Config'> (<class 'transformers.models.speecht5.tokenization_speecht5.SpeechT5Tokenizer'>, None)\n",
      "<class 'transformers.models.splinter.configuration_splinter.SplinterConfig'> (<class 'transformers.models.splinter.tokenization_splinter.SplinterTokenizer'>, <class 'transformers.models.splinter.tokenization_splinter_fast.SplinterTokenizerFast'>)\n",
      "<class 'transformers.models.squeezebert.configuration_squeezebert.SqueezeBertConfig'> (<class 'transformers.models.squeezebert.tokenization_squeezebert.SqueezeBertTokenizer'>, <class 'transformers.models.squeezebert.tokenization_squeezebert_fast.SqueezeBertTokenizerFast'>)\n",
      "<class 'transformers.models.stablelm.configuration_stablelm.StableLmConfig'> (None, <class 'transformers.models.gpt_neox.tokenization_gpt_neox_fast.GPTNeoXTokenizerFast'>)\n",
      "<class 'transformers.models.starcoder2.configuration_starcoder2.Starcoder2Config'> (<class 'transformers.models.gpt2.tokenization_gpt2.GPT2Tokenizer'>, <class 'transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast'>)\n",
      "<class 'transformers.models.switch_transformers.configuration_switch_transformers.SwitchTransformersConfig'> (<class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>, <class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>)\n",
      "<class 'transformers.models.t5.configuration_t5.T5Config'> (<class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>, <class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>)\n",
      "<class 'transformers.models.tapas.configuration_tapas.TapasConfig'> (<class 'transformers.models.tapas.tokenization_tapas.TapasTokenizer'>, None)\n",
      "<class 'transformers.models.deprecated.transfo_xl.configuration_transfo_xl.TransfoXLConfig'> (<class 'transformers.models.deprecated.transfo_xl.tokenization_transfo_xl.TransfoXLTokenizer'>, None)\n",
      "<class 'transformers.models.tvp.configuration_tvp.TvpConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.udop.configuration_udop.UdopConfig'> (<class 'transformers.models.udop.tokenization_udop.UdopTokenizer'>, <class 'transformers.models.udop.tokenization_udop_fast.UdopTokenizerFast'>)\n",
      "<class 'transformers.models.umt5.configuration_umt5.UMT5Config'> (<class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>, <class 'transformers.models.t5.tokenization_t5_fast.T5TokenizerFast'>)\n",
      "<class 'transformers.models.video_llava.configuration_video_llava.VideoLlavaConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.vilt.configuration_vilt.ViltConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.vipllava.configuration_vipllava.VipLlavaConfig'> (<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n",
      "<class 'transformers.models.visual_bert.configuration_visual_bert.VisualBertConfig'> (<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>, <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>)\n",
      "<class 'transformers.models.vits.configuration_vits.VitsConfig'> (<class 'transformers.models.vits.tokenization_vits.VitsTokenizer'>, None)\n",
      "<class 'transformers.models.wav2vec2.configuration_wav2vec2.Wav2Vec2Config'> (<class 'transformers.models.wav2vec2.tokenization_wav2vec2.Wav2Vec2CTCTokenizer'>, None)\n",
      "<class 'transformers.models.wav2vec2_bert.configuration_wav2vec2_bert.Wav2Vec2BertConfig'> (<class 'transformers.models.wav2vec2.tokenization_wav2vec2.Wav2Vec2CTCTokenizer'>, None)\n",
      "<class 'transformers.models.wav2vec2_conformer.configuration_wav2vec2_conformer.Wav2Vec2ConformerConfig'> (<class 'transformers.models.wav2vec2.tokenization_wav2vec2.Wav2Vec2CTCTokenizer'>, None)\n",
      "<class 'transformers.models.whisper.configuration_whisper.WhisperConfig'> (<class 'transformers.models.whisper.tokenization_whisper.WhisperTokenizer'>, <class 'transformers.models.whisper.tokenization_whisper_fast.WhisperTokenizerFast'>)\n",
      "<class 'transformers.models.x_clip.configuration_x_clip.XCLIPConfig'> (<class 'transformers.models.clip.tokenization_clip.CLIPTokenizer'>, <class 'transformers.models.clip.tokenization_clip_fast.CLIPTokenizerFast'>)\n",
      "<class 'transformers.models.xglm.configuration_xglm.XGLMConfig'> (<class 'transformers.models.xglm.tokenization_xglm.XGLMTokenizer'>, <class 'transformers.models.xglm.tokenization_xglm_fast.XGLMTokenizerFast'>)\n",
      "<class 'transformers.models.xlm.configuration_xlm.XLMConfig'> (<class 'transformers.models.xlm.tokenization_xlm.XLMTokenizer'>, None)\n",
      "<class 'transformers.models.deprecated.xlm_prophetnet.configuration_xlm_prophetnet.XLMProphetNetConfig'> (<class 'transformers.models.deprecated.xlm_prophetnet.tokenization_xlm_prophetnet.XLMProphetNetTokenizer'>, None)\n",
      "<class 'transformers.models.xlm_roberta.configuration_xlm_roberta.XLMRobertaConfig'> (<class 'transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer'>, <class 'transformers.models.xlm_roberta.tokenization_xlm_roberta_fast.XLMRobertaTokenizerFast'>)\n",
      "<class 'transformers.models.xlm_roberta_xl.configuration_xlm_roberta_xl.XLMRobertaXLConfig'> (<class 'transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer'>, <class 'transformers.models.xlm_roberta.tokenization_xlm_roberta_fast.XLMRobertaTokenizerFast'>)\n",
      "<class 'transformers.models.xlnet.configuration_xlnet.XLNetConfig'> (<class 'transformers.models.xlnet.tokenization_xlnet.XLNetTokenizer'>, <class 'transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast'>)\n",
      "<class 'transformers.models.xmod.configuration_xmod.XmodConfig'> (<class 'transformers.models.xlm_roberta.tokenization_xlm_roberta.XLMRobertaTokenizer'>, <class 'transformers.models.xlm_roberta.tokenization_xlm_roberta_fast.XLMRobertaTokenizerFast'>)\n",
      "<class 'transformers.models.yoso.configuration_yoso.YosoConfig'> (<class 'transformers.models.albert.tokenization_albert.AlbertTokenizer'>, <class 'transformers.models.albert.tokenization_albert_fast.AlbertTokenizerFast'>)\n"
     ]
    }
   ],
   "source": [
    "# List every config class registered in TOKENIZER_MAPPING together with its\n",
    "# (tokenizer, fast tokenizer) class pair; either element of the pair may be None.\n",
    "for config_class, tokenizer_classes in TOKENIZER_MAPPING.items():\n",
    "    print(config_class, tokenizer_classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'AlbertConfig': 'albert',\n",
       " 'AlignConfig': 'align',\n",
       " 'AltCLIPConfig': 'altclip',\n",
       " 'ASTConfig': 'audio-spectrogram-transformer',\n",
       " 'AutoformerConfig': 'autoformer',\n",
       " 'BarkConfig': 'bark',\n",
       " 'BartConfig': 'bart',\n",
       " 'BeitConfig': 'beit',\n",
       " 'BertConfig': 'bert',\n",
       " 'BertGenerationConfig': 'bert-generation',\n",
       " 'BigBirdConfig': 'big_bird',\n",
       " 'BigBirdPegasusConfig': 'bigbird_pegasus',\n",
       " 'BioGptConfig': 'biogpt',\n",
       " 'BitConfig': 'bit',\n",
       " 'BlenderbotConfig': 'blenderbot',\n",
       " 'BlenderbotSmallConfig': 'blenderbot-small',\n",
       " 'BlipConfig': 'blip',\n",
       " 'Blip2Config': 'blip-2',\n",
       " 'BloomConfig': 'bloom',\n",
       " 'BridgeTowerConfig': 'bridgetower',\n",
       " 'BrosConfig': 'bros',\n",
       " 'CamembertConfig': 'camembert',\n",
       " 'CanineConfig': 'canine',\n",
       " 'ChameleonConfig': 'chameleon',\n",
       " 'ChineseCLIPConfig': 'chinese_clip',\n",
       " 'ChineseCLIPVisionConfig': 'chinese_clip_vision_model',\n",
       " 'ClapConfig': 'clap',\n",
       " 'CLIPConfig': 'clip',\n",
       " 'CLIPVisionConfig': 'clip_vision_model',\n",
       " 'CLIPSegConfig': 'clipseg',\n",
       " 'ClvpConfig': 'clvp',\n",
       " 'LlamaConfig': 'llama',\n",
       " 'CodeGenConfig': 'codegen',\n",
       " 'CohereConfig': 'cohere',\n",
       " 'ConditionalDetrConfig': 'conditional_detr',\n",
       " 'ConvBertConfig': 'convbert',\n",
       " 'ConvNextConfig': 'convnext',\n",
       " 'ConvNextV2Config': 'convnextv2',\n",
       " 'CpmAntConfig': 'cpmant',\n",
       " 'CTRLConfig': 'ctrl',\n",
       " 'CvtConfig': 'cvt',\n",
       " 'Data2VecAudioConfig': 'data2vec-audio',\n",
       " 'Data2VecTextConfig': 'data2vec-text',\n",
       " 'Data2VecVisionConfig': 'data2vec-vision',\n",
       " 'DbrxConfig': 'dbrx',\n",
       " 'DebertaConfig': 'deberta',\n",
       " 'DebertaV2Config': 'deberta-v2',\n",
       " 'DecisionTransformerConfig': 'decision_transformer',\n",
       " 'DeformableDetrConfig': 'deformable_detr',\n",
       " 'DeiTConfig': 'deit',\n",
       " 'DepthAnythingConfig': 'depth_anything',\n",
       " 'DetaConfig': 'deta',\n",
       " 'DetrConfig': 'detr',\n",
       " 'DinatConfig': 'dinat',\n",
       " 'Dinov2Config': 'dinov2',\n",
       " 'DistilBertConfig': 'distilbert',\n",
       " 'DonutSwinConfig': 'donut-swin',\n",
       " 'DPRConfig': 'dpr',\n",
       " 'DPTConfig': 'dpt',\n",
       " 'EfficientFormerConfig': 'efficientformer',\n",
       " 'EfficientNetConfig': 'efficientnet',\n",
       " 'ElectraConfig': 'electra',\n",
       " 'EncodecConfig': 'encodec',\n",
       " 'EncoderDecoderConfig': 'encoder-decoder',\n",
       " 'ErnieConfig': 'ernie',\n",
       " 'ErnieMConfig': 'ernie_m',\n",
       " 'EsmConfig': 'esm',\n",
       " 'FalconConfig': 'falcon',\n",
       " 'FastSpeech2ConformerConfig': 'fastspeech2_conformer',\n",
       " 'FlaubertConfig': 'flaubert',\n",
       " 'FlavaConfig': 'flava',\n",
       " 'FNetConfig': 'fnet',\n",
       " 'FocalNetConfig': 'focalnet',\n",
       " 'FSMTConfig': 'fsmt',\n",
       " 'FunnelConfig': 'funnel',\n",
       " 'FuyuConfig': 'fuyu',\n",
       " 'GemmaConfig': 'gemma',\n",
       " 'Gemma2Config': 'gemma2',\n",
       " 'GitConfig': 'git',\n",
       " 'GLPNConfig': 'glpn',\n",
       " 'GPT2Config': 'gpt2',\n",
       " 'GPTBigCodeConfig': 'gpt_bigcode',\n",
       " 'GPTNeoConfig': 'gpt_neo',\n",
       " 'GPTNeoXConfig': 'gpt_neox',\n",
       " 'GPTNeoXJapaneseConfig': 'gpt_neox_japanese',\n",
       " 'GPTJConfig': 'gptj',\n",
       " 'GPTSanJapaneseConfig': 'gptsan-japanese',\n",
       " 'GraphormerConfig': 'graphormer',\n",
       " 'GroundingDinoConfig': 'grounding-dino',\n",
       " 'GroupViTConfig': 'groupvit',\n",
       " 'HieraConfig': 'hiera',\n",
       " 'HubertConfig': 'hubert',\n",
       " 'IBertConfig': 'ibert',\n",
       " 'IdeficsConfig': 'idefics',\n",
       " 'Idefics2Config': 'idefics2',\n",
       " 'ImageGPTConfig': 'imagegpt',\n",
       " 'InformerConfig': 'informer',\n",
       " 'InstructBlipConfig': 'instructblip',\n",
       " 'InstructBlipVideoConfig': 'instructblipvideo',\n",
       " 'JambaConfig': 'jamba',\n",
       " 'JetMoeConfig': 'jetmoe',\n",
       " 'JukeboxConfig': 'jukebox',\n",
       " 'Kosmos2Config': 'kosmos-2',\n",
       " 'LayoutLMConfig': 'layoutlm',\n",
       " 'LayoutLMv2Config': 'layoutlmv2',\n",
       " 'LayoutLMv3Config': 'layoutlmv3',\n",
       " 'LEDConfig': 'led',\n",
       " 'LevitConfig': 'levit',\n",
       " 'LiltConfig': 'lilt',\n",
       " 'LlavaConfig': 'llava',\n",
       " 'LlavaNextVideoConfig': 'llava-next-video',\n",
       " 'LlavaNextConfig': 'llava_next',\n",
       " 'LongformerConfig': 'longformer',\n",
       " 'LongT5Config': 'longt5',\n",
       " 'LukeConfig': 'luke',\n",
       " 'LxmertConfig': 'lxmert',\n",
       " 'M2M100Config': 'm2m_100',\n",
       " 'MambaConfig': 'mamba',\n",
       " 'MarianConfig': 'marian',\n",
       " 'MarkupLMConfig': 'markuplm',\n",
       " 'Mask2FormerConfig': 'mask2former',\n",
       " 'MaskFormerConfig': 'maskformer',\n",
       " 'MaskFormerSwinConfig': 'maskformer-swin',\n",
       " 'MBartConfig': 'mbart',\n",
       " 'MCTCTConfig': 'mctct',\n",
       " 'MegaConfig': 'mega',\n",
       " 'MegatronBertConfig': 'megatron-bert',\n",
       " 'MgpstrConfig': 'mgp-str',\n",
       " 'MistralConfig': 'mistral',\n",
       " 'MixtralConfig': 'mixtral',\n",
       " 'MobileBertConfig': 'mobilebert',\n",
       " 'MobileNetV1Config': 'mobilenet_v1',\n",
       " 'MobileNetV2Config': 'mobilenet_v2',\n",
       " 'MobileViTConfig': 'mobilevit',\n",
       " 'MobileViTV2Config': 'mobilevitv2',\n",
       " 'MPNetConfig': 'mpnet',\n",
       " 'MptConfig': 'mpt',\n",
       " 'MraConfig': 'mra',\n",
       " 'MT5Config': 'mt5',\n",
       " 'MusicgenConfig': 'musicgen',\n",
       " 'MusicgenMelodyConfig': 'musicgen_melody',\n",
       " 'MvpConfig': 'mvp',\n",
       " 'NatConfig': 'nat',\n",
       " 'NezhaConfig': 'nezha',\n",
       " 'NllbMoeConfig': 'nllb-moe',\n",
       " 'VisionEncoderDecoderConfig': 'vision-encoder-decoder',\n",
       " 'NystromformerConfig': 'nystromformer',\n",
       " 'OlmoConfig': 'olmo',\n",
       " 'OneFormerConfig': 'oneformer',\n",
       " 'OpenLlamaConfig': 'open-llama',\n",
       " 'OpenAIGPTConfig': 'openai-gpt',\n",
       " 'OPTConfig': 'opt',\n",
       " 'Owlv2Config': 'owlv2',\n",
       " 'OwlViTConfig': 'owlvit',\n",
       " 'PaliGemmaConfig': 'paligemma',\n",
       " 'PatchTSMixerConfig': 'patchtsmixer',\n",
       " 'PatchTSTConfig': 'patchtst',\n",
       " 'PegasusConfig': 'pegasus',\n",
       " 'PegasusXConfig': 'pegasus_x',\n",
       " 'PerceiverConfig': 'perceiver',\n",
       " 'PersimmonConfig': 'persimmon',\n",
       " 'PhiConfig': 'phi',\n",
       " 'Phi3Config': 'phi3',\n",
       " 'Pix2StructConfig': 'pix2struct',\n",
       " 'PLBartConfig': 'plbart',\n",
       " 'PoolFormerConfig': 'poolformer',\n",
       " 'Pop2PianoConfig': 'pop2piano',\n",
       " 'ProphetNetConfig': 'prophetnet',\n",
       " 'PvtConfig': 'pvt',\n",
       " 'PvtV2Config': 'pvt_v2',\n",
       " 'QDQBertConfig': 'qdqbert',\n",
       " 'Qwen2Config': 'qwen2',\n",
       " 'Qwen2MoeConfig': 'qwen2_moe',\n",
       " 'RagConfig': 'rag',\n",
       " 'RealmConfig': 'realm',\n",
       " 'RecurrentGemmaConfig': 'recurrent_gemma',\n",
       " 'ReformerConfig': 'reformer',\n",
       " 'RegNetConfig': 'regnet',\n",
       " 'RemBertConfig': 'rembert',\n",
       " 'ResNetConfig': 'resnet',\n",
       " 'RetriBertConfig': 'retribert',\n",
       " 'RobertaConfig': 'roberta',\n",
       " 'RobertaPreLayerNormConfig': 'roberta-prelayernorm',\n",
       " 'RoCBertConfig': 'roc_bert',\n",
       " 'RoFormerConfig': 'roformer',\n",
       " 'RTDetrConfig': 'rt_detr',\n",
       " 'RTDetrResNetConfig': 'rt_detr_resnet',\n",
       " 'RwkvConfig': 'rwkv',\n",
       " 'SamConfig': 'sam',\n",
       " 'SeamlessM4TConfig': 'seamless_m4t',\n",
       " 'SeamlessM4Tv2Config': 'seamless_m4t_v2',\n",
       " 'SegformerConfig': 'segformer',\n",
       " 'SegGptConfig': 'seggpt',\n",
       " 'SEWConfig': 'sew',\n",
       " 'SEWDConfig': 'sew-d',\n",
       " 'SiglipConfig': 'siglip',\n",
       " 'SiglipVisionConfig': 'siglip_vision_model',\n",
       " 'SpeechEncoderDecoderConfig': 'speech-encoder-decoder',\n",
       " 'Speech2TextConfig': 'speech_to_text',\n",
       " 'Speech2Text2Config': 'speech_to_text_2',\n",
       " 'SpeechT5Config': 'speecht5',\n",
       " 'SplinterConfig': 'splinter',\n",
       " 'SqueezeBertConfig': 'squeezebert',\n",
       " 'StableLmConfig': 'stablelm',\n",
       " 'Starcoder2Config': 'starcoder2',\n",
       " 'SuperPointConfig': 'superpoint',\n",
       " 'SwiftFormerConfig': 'swiftformer',\n",
       " 'SwinConfig': 'swin',\n",
       " 'Swin2SRConfig': 'swin2sr',\n",
       " 'Swinv2Config': 'swinv2',\n",
       " 'SwitchTransformersConfig': 'switch_transformers',\n",
       " 'T5Config': 't5',\n",
       " 'TableTransformerConfig': 'table-transformer',\n",
       " 'TapasConfig': 'tapas',\n",
       " 'TimeSeriesTransformerConfig': 'time_series_transformer',\n",
       " 'TimesformerConfig': 'timesformer',\n",
       " 'TimmBackboneConfig': 'timm_backbone',\n",
       " 'TrajectoryTransformerConfig': 'trajectory_transformer',\n",
       " 'TransfoXLConfig': 'transfo-xl',\n",
       " 'TrOCRConfig': 'trocr',\n",
       " 'TvltConfig': 'tvlt',\n",
       " 'TvpConfig': 'tvp',\n",
       " 'UdopConfig': 'udop',\n",
       " 'UMT5Config': 'umt5',\n",
       " 'UniSpeechConfig': 'unispeech',\n",
       " 'UniSpeechSatConfig': 'unispeech-sat',\n",
       " 'UnivNetConfig': 'univnet',\n",
       " 'UperNetConfig': 'upernet',\n",
       " 'VanConfig': 'van',\n",
       " 'VideoLlavaConfig': 'video_llava',\n",
       " 'VideoMAEConfig': 'videomae',\n",
       " 'ViltConfig': 'vilt',\n",
       " 'VipLlavaConfig': 'vipllava',\n",
       " 'VisionTextDualEncoderConfig': 'vision-text-dual-encoder',\n",
       " 'VisualBertConfig': 'visual_bert',\n",
       " 'ViTConfig': 'vit',\n",
       " 'ViTHybridConfig': 'vit_hybrid',\n",
       " 'ViTMAEConfig': 'vit_mae',\n",
       " 'ViTMSNConfig': 'vit_msn',\n",
       " 'VitDetConfig': 'vitdet',\n",
       " 'VitMatteConfig': 'vitmatte',\n",
       " 'VitsConfig': 'vits',\n",
       " 'VivitConfig': 'vivit',\n",
       " 'Wav2Vec2Config': 'wav2vec2',\n",
       " 'Wav2Vec2BertConfig': 'wav2vec2-bert',\n",
       " 'Wav2Vec2ConformerConfig': 'wav2vec2-conformer',\n",
       " 'WavLMConfig': 'wavlm',\n",
       " 'WhisperConfig': 'whisper',\n",
       " 'XCLIPConfig': 'xclip',\n",
       " 'XGLMConfig': 'xglm',\n",
       " 'XLMConfig': 'xlm',\n",
       " 'XLMProphetNetConfig': 'xlm-prophetnet',\n",
       " 'XLMRobertaConfig': 'xlm-roberta',\n",
       " 'XLMRobertaXLConfig': 'xlm-roberta-xl',\n",
       " 'XLNetConfig': 'xlnet',\n",
       " 'XmodConfig': 'xmod',\n",
       " 'YolosConfig': 'yolos',\n",
       " 'YosoConfig': 'yoso',\n",
       " 'ZoeDepthConfig': 'zoedepth'}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the mapping from config class name (e.g. 'LlamaConfig') to model-type string (e.g. 'llama').\n",
    "CONFIG_TO_TYPE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): _extra_content is a private attribute of the mapping; presumably it holds\n",
    "# entries registered at runtime - confirm. The saved output of this cell is empty.\n",
    "for extra_config, extra_tokenizers in TOKENIZER_MAPPING._extra_content.items():\n",
    "    print(extra_config, extra_tokenizers)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LlamaTokenizerFast(name_or_path='/data/models/modelscope/modelscope/Llama-2-7b-ms', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={\n",
       "\t0: AddedToken(\"<unk>\", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),\n",
       "\t1: AddedToken(\"<s>\", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),\n",
       "\t2: AddedToken(\"</s>\", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),\n",
       "\t32000: AddedToken(\"<pad>\", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),\n",
       "}"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
    "# Local checkpoint directory; later cells reuse this variable.\n",
    "pretrain_model_dir = \"/data/models/modelscope/modelscope/Llama-2-7b-ms\"\n",
    "\n",
    "# Load with tokenizer_type given explicitly as \"llama\".\n",
    "tokenizer = AutoTokenizer.from_pretrained(\n",
    "    pretrain_model_dir,\n",
    "    tokenizer_type=\"llama\",\n",
    ")\n",
    "tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reload the tokenizer without tokenizer_type and show which id is used for padding.\n",
    "# AutoTokenizer and pretrain_model_dir are defined in the previous cell; the duplicated\n",
    "# import/constant and the no-op bare `tokenizer` expression were removed.\n",
    "tokenizer = AutoTokenizer.from_pretrained(pretrain_model_dir)\n",
    "tokenizer.pad_token_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LlamaTokenizerFast(name_or_path='/data/models/modelscope/modelscope/Llama-2-7b-ms', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={\n",
       "\t0: AddedToken(\"<unk>\", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),\n",
       "\t1: AddedToken(\"<s>\", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),\n",
       "\t2: AddedToken(\"</s>\", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),\n",
       "\t32000: AddedToken(\"<pad>\", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),\n",
       "}"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Adjust the tokenizer attributes. Relative to the state right after loading:\n",
    "#   padding_side: 'left'          -> 'right'\n",
    "#   pad_token:    '<unk>' (id=0)  -> '</s>' (id=2)\n",
    "# Other fields recorded from the repr before the change (not modified here):\n",
    "#   name_or_path='/data/models/modelscope/modelscope/Llama-2-7b-ms',\n",
    "#   vocab_size=32000, model_max_length=1000000000000000019884624838656,\n",
    "#   is_fast=True, truncation_side='right', clean_up_tokenization_spaces=False,\n",
    "#   bos_token='<s>', eos_token='</s>', unk_token='<unk>',\n",
    "#   added_tokens_decoder: 0='<unk>', 1='<s>', 2='</s>', 32000='<pad>'\n",
    "\n",
    "# Author's note 1: padding_side must be 'right'; otherwise training may not converge\n",
    "# when the batch size is greater than 1.\n",
    "tokenizer.padding_side = \"right\"\n",
    "# Author's note 7: pad_token_id is changed from 0 to 2 to avoid NaN losses.\n",
    "tokenizer.pad_token_id = 2\n",
    "\n",
    "# Keep the tokenizer as the last expression so the cell displays the updated repr\n",
    "# (previously a trailing triple-quoted note was the last expression and got displayed instead).\n",
    "tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>\n",
      "<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>\n"
     ]
    }
   ],
   "source": [
    "# Step 1: determine the tokenizer classes from the model-type name.\n",
    "from transformers.models.auto.tokenization_auto import (\n",
    "    TOKENIZER_MAPPING_NAMES,\n",
    "    tokenizer_class_from_name,\n",
    ")\n",
    "\n",
    "# The \"llama\" entry is unpacked into two class-name strings: tokenizer first, fast tokenizer second.\n",
    "tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(\"llama\")\n",
    "tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple\n",
    "\n",
    "# Turn each class name into the class object; the fast class is printed first, as before.\n",
    "tokenizer_class_fast = tokenizer_class_from_name(tokenizer_fast_class_name)\n",
    "print(tokenizer_class_fast)\n",
    "\n",
    "tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)\n",
    "print(tokenizer_class)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'add_bos_token': True,\n",
       " 'add_eos_token': False,\n",
       " 'bos_token': {'__type': 'AddedToken',\n",
       "  'content': '<s>',\n",
       "  'lstrip': False,\n",
       "  'normalized': True,\n",
       "  'rstrip': False,\n",
       "  'single_word': False},\n",
       " 'clean_up_tokenization_spaces': False,\n",
       " 'eos_token': {'__type': 'AddedToken',\n",
       "  'content': '</s>',\n",
       "  'lstrip': False,\n",
       "  'normalized': True,\n",
       "  'rstrip': False,\n",
       "  'single_word': False},\n",
       " 'legacy': False,\n",
       " 'model_max_length': 1000000000000000019884624838656,\n",
       " 'pad_token': None,\n",
       " 'sp_model_kwargs': {},\n",
       " 'tokenizer_class': 'LlamaTokenizer',\n",
       " 'unk_token': {'__type': 'AddedToken',\n",
       "  'content': '<unk>',\n",
       "  'lstrip': False,\n",
       "  'normalized': True,\n",
       "  'rstrip': False,\n",
       "  'single_word': False},\n",
       " '_commit_hash': None}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers.models.auto.tokenization_auto import get_tokenizer_config\n",
    "\n",
    "# Another way to obtain the tokenizer's configuration. Author's notes on the procedure:\n",
    "# 1. A file named by TOKENIZER_CONFIG_FILE = \"tokenizer_config.json\" is read. It is JSON,\n",
    "#    so the tokenizer settings can be read straight from it; for this checkpoint its\n",
    "#    contents correspond to the dict displayed by this cell (add_bos_token, add_eos_token,\n",
    "#    bos_token/eos_token/unk_token as AddedToken dicts, clean_up_tokenization_spaces,\n",
    "#    legacy, model_max_length, pad_token=null, sp_model_kwargs, tokenizer_class).\n",
    "# 2. The config explicitly contains \"tokenizer_class\": \"LlamaTokenizer\", so the following\n",
    "#    loading steps use that class.\n",
    "# 3. auto_map is parsed next. It appears to be configurable, but the original config file\n",
    "#    has no such entry, so that step is skipped.\n",
    "# Whether the file is downloaded or fetched some other way in between is a detail that\n",
    "# can be ignored here.\n",
    "tokenizer_config = get_tokenizer_config(pretrain_model_dir)\n",
    "\n",
    "# Keep the config as the last expression so the cell displays the dict\n",
    "# (previously a trailing triple-quoted note was the last expression and got displayed instead).\n",
    "tokenizer_config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4、 根据配置信息加载自动配置"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "None\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoConfig\n",
    "\n",
    "# Load the model config from the same checkpoint directory and read its tokenizer_class.\n",
    "# The redundant re-import of AutoConfig after use and the no-op `**{}` were removed.\n",
    "pretrained_model_name_or_path = pretrain_model_dir\n",
    "# NOTE(review): trust_remote_code=True is kept from the original call; confirm it is\n",
    "# actually required for this local checkpoint.\n",
    "config = AutoConfig.from_pretrained(\n",
    "    pretrained_model_name_or_path,\n",
    "    trust_remote_code=True,\n",
    ")\n",
    "# For this checkpoint the saved output is None, i.e. the model config names no tokenizer class.\n",
    "config_tokenizer_class = config.tokenizer_class\n",
    "print(config_tokenizer_class)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "True\n",
      "(<class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>, <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>)\n"
     ]
    }
   ],
   "source": [
    "# When the tokenizer class is looked up locally, the lookup goes through TOKENIZER_MAPPING,\n",
    "# keyed by the config's type, to determine which tokenizer classes apply.\n",
    "config_cls = type(config)\n",
    "print(config_cls in TOKENIZER_MAPPING)\n",
    "print(TOKENIZER_MAPPING[config_cls])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 5、 解析 trust_remote_code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers.models.auto.tokenization_auto import resolve_trust_remote_code\n",
    "\n",
    "# Inputs: trust_remote_code=True, the pretrained-model directory,\n",
    "# has_local_code=True and has_remote_code=False. The saved result is True.\n",
    "trust_remote_code = True\n",
    "pretrained_model_name_or_path = pretrain_model_dir\n",
    "has_local_code = True\n",
    "has_remote_code = False\n",
    "\n",
    "trust_remote_code = resolve_trust_remote_code(\n",
    "    trust_remote_code,\n",
    "    pretrained_model_name_or_path,\n",
    "    has_local_code,\n",
    "    has_remote_code,\n",
    ")\n",
    "trust_remote_code"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
